library(dplyr)
library(readr)
library(ggplot2)
library(openxlsx)
library(knitr)
library(tibble)
library(stringr)
library(stringi)
library(readxl)
library(lubridate)
library(shiny)
library(plotly)
library(ruODK)
Loading the Data and Removing Training Data
# Unzip and extract ODK data from ODK zip
# `export_load_from_odk()` is a helper defined outside this section. It is
# called with the report parameter `params$svc`, and its result becomes the
# working data frame `df` used by every later chunk.
# NOTE(review): presumably the helper pulls the export via ruODK (the ruODK
# settings are printed right after this call) - confirm in the helper itself.
df <- export_load_from_odk(params$svc)
## <ruODK settings>
## Default ODK Central Project ID: 2
## Default ODK Central Form ID: 07-TIMCI-timeflow
## Default ODK Central URL: https://timicodktest.smartforest.de
## Default ODK Central Username: lucas.silbernagel@swisstph.ch
## Default ODK Central Password: run ruODK::get_default_pw() to show
## Default ODK Central Passphrase: run ruODK::get_default_pp() to show
## Default Time Zone: Europe/Berlin
## Default ODK Central Version: 1.1
## Default HTTP GET retries: 3
## Verbose messages: TRUE
## Test ODK Central Project ID:
## Test ODK Central Form ID:
## Test ODK Central Form ID (ZIP tests):
## Test ODK Central Form ID (Attachment tests):
## Test ODK Central Form ID (Parsing tests):
## Test ODK Central Form ID (WKT tests):
## Test ODK Central URL:
## Test ODK Central Username:
## Test ODK Central Password: run ruODK::get_test_pw() to show
## Test ODK Central Passphrase: run ruODK::get_test_pp() to show
## Test ODK Central Version: 1.1
# Convert the `start` and `end` columns (integer milliseconds, per the
# original note) into time stamps with the helper `format_date_ms()`.
df[c("start", "end")] <- lapply(df[c("start", "end")], format_date_ms)
# Print the first rows to check the converted columns
head(df)
| instance.ID | event | node | start | end | latitude | longitude | accuracy | old.value | new.value |
|---|---|---|---|---|---|---|---|---|---|
| uuid:2985578d-e410-4a14-bd31-f6813536d5c8 | form start | NA | 2021-01-23 20:50:14 | NA | NA | NA | NA | NA | NA |
| uuid:2985578d-e410-4a14-bd31-f6813536d5c8 | group questions | /data/front_page | 2021-01-23 20:50:14 | 2021-01-23 20:50:18 | NA | NA | NA | NA | NA |
| uuid:2985578d-e410-4a14-bd31-f6813536d5c8 | group questions | /data/visit_start | 2021-01-23 20:50:18 | 2021-01-23 20:50:22 | NA | NA | NA | NA | NA |
| uuid:2985578d-e410-4a14-bd31-f6813536d5c8 | question | /data/visit_start/b1_4 | 2021-01-23 20:50:18 | 2021-01-23 20:50:22 | NA | NA | NA | NA | Joal |
| uuid:2985578d-e410-4a14-bd31-f6813536d5c8 | question | /data/steps[1]/step_type | 2021-01-23 20:50:22 | 2021-01-23 20:50:27 | NA | NA | NA | NA | 1 |
| uuid:2985578d-e410-4a14-bd31-f6813536d5c8 | question | /data/steps[2]/step_type | 2021-01-23 20:50:27 | 2021-01-23 20:50:31 | NA | NA | NA | NA | 1 |
# filtering for events that occurred after 18 July 2021 (currently disabled)
#df <- subset(df, as.Date(start) > as.Date("18.07.2021", "%d.%m.%Y"))
Deriving New Features
Time Spent per Event
# Time spent per event, in whole seconds.
# A plain `end - start` returns a difftime whose unit (secs, mins, hours, ...)
# is chosen automatically from the data, so the numeric value could silently
# be minutes or hours. Asking for seconds explicitly keeps the unit fixed,
# which the later "/ 60" conversion to minutes relies on.
df$time_spent <- round(as.numeric(difftime(df$end, df$start, units = "secs")))
Question
# Derive the question name for every event by applying the helper
# `create_question()` to each node path, so that only the question name remains.
df[["question"]] <- sapply(X = df[["node"]], FUN = create_question)
Question Decoded
# Decode the question names. `decode_question()` is a helper defined outside
# this section; it receives the data frame, the question names and
# `params$svc`, and its result replaces `df`.
# NOTE(review): judging by the final summary it appears to add the column
# `question_decoded` - confirm against the helper.
df <- decode_question(df, df$question, params$svc)
## <ruODK settings>
## Default ODK Central Project ID: 2
## Default ODK Central Form ID: 07-TIMCI-timeflow
## Default ODK Central URL: https://timicodktest.smartforest.de
## Default ODK Central Username: lucas.silbernagel@swisstph.ch
## Default ODK Central Password: run ruODK::get_default_pw() to show
## Default ODK Central Passphrase: run ruODK::get_default_pp() to show
## Default Time Zone: Europe/Berlin
## Default ODK Central Version: 1.1
## Default HTTP GET retries: 3
## Verbose messages: TRUE
## Test ODK Central Project ID:
## Test ODK Central Form ID:
## Test ODK Central Form ID (ZIP tests):
## Test ODK Central Form ID (Attachment tests):
## Test ODK Central Form ID (Parsing tests):
## Test ODK Central Form ID (WKT tests):
## Test ODK Central URL:
## Test ODK Central Username:
## Test ODK Central Password: run ruODK::get_test_pw() to show
## Test ODK Central Passphrase: run ruODK::get_test_pp() to show
## Test ODK Central Version: 1.1
Categorical Answers Decoded
# Decode categorical answers. `decode_categories()` is a helper defined
# outside this section; it receives the data frame and `params$svc`, and its
# result replaces `df`.
# NOTE(review): judging by the final summary it appears to add
# `new_value_decoded` and `old_value_decoded` - confirm against the helper.
df <- decode_categories(df, params$svc)
## <ruODK settings>
## Default ODK Central Project ID: 2
## Default ODK Central Form ID: 07-TIMCI-timeflow
## Default ODK Central URL: https://timicodktest.smartforest.de
## Default ODK Central Username: lucas.silbernagel@swisstph.ch
## Default ODK Central Password: run ruODK::get_default_pw() to show
## Default ODK Central Passphrase: run ruODK::get_default_pp() to show
## Default Time Zone: Europe/Berlin
## Default ODK Central Version: 1.1
## Default HTTP GET retries: 3
## Verbose messages: TRUE
## Test ODK Central Project ID:
## Test ODK Central Form ID:
## Test ODK Central Form ID (ZIP tests):
## Test ODK Central Form ID (Attachment tests):
## Test ODK Central Form ID (Parsing tests):
## Test ODK Central Form ID (WKT tests):
## Test ODK Central URL:
## Test ODK Central Username:
## Test ODK Central Password: run ruODK::get_test_pw() to show
## Test ODK Central Passphrase: run ruODK::get_test_pp() to show
## Test ODK Central Version: 1.1
Time until a Response was Changed + Stream of Answer Changes
# Time until a response was changed.
#
# A row counts as a change of the previous answer when its `old.value` equals
# the `new.value` of the row directly before it (after sorting), both values
# are present, and both rows belong to the same instance and node. For such
# rows `time_till_change` is the number of seconds between the end of the
# previous event and the start of this one; all other rows stay NA.
#
# Fixes compared with the former row-by-row loop:
#   * the loop started at i = 1 and read row 0, giving a zero-length
#     comparison that can abort with "missing value where TRUE/FALSE needed";
#   * `1:nrow(df)` iterated over c(1, 0) for an empty data frame;
#   * the time difference used difftime's automatic unit, so values could be
#     minutes or hours instead of seconds;
#   * consecutive rows were compared across instance/node boundaries.
# NOTE(review): `changed_from` was never filled by the original loop either;
# it is kept as an all-NA placeholder - decide what it should contain.
df <- df %>%
  # bringing the data in the right order
  arrange(instance.ID, node, start) %>%
  mutate(
    # values of the preceding row, NA for the very first row
    .prev_new_value = lag(as.character(new.value)),
    .prev_end = lag(end),
    .same_question = instance.ID == lag(instance.ID) & node == lag(node),
    # NA-safe flag: TRUE only when every condition is known and holds
    .is_change = !is.na(old.value) &
      !is.na(.prev_new_value) &
      !is.na(.same_question) & .same_question &
      as.character(old.value) == .prev_new_value,
    time_till_change = ifelse(
      .is_change,
      round(as.numeric(difftime(start, .prev_end, units = "secs"))),
      NA_real_
    ),
    changed_from = NA
  ) %>%
  # drop the helper columns again so only the two new features are added
  select(-.prev_new_value, -.prev_end, -.same_question, -.is_change)
Preview and Summary of the Final Data
# Preview the first rows of the data frame including the derived columns
head(df)
| instance.ID | event | node | start | end | latitude | longitude | accuracy | old.value | new.value | time_spent | question | question_decoded | new_value_decoded | old_value_decoded | time_till_change | changed_from |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| uuid:0e0117d5-cc64-4d13-a614-3a16d5ad672a | group questions | /data/child_identification | 2021-04-23 18:59:31 | 2021-04-23 19:00:00 | NA | NA | NA | NA | NA | 29 | child_identification | child_identification | NA | NA | NA | NA |
| uuid:0e0117d5-cc64-4d13-a614-3a16d5ad672a | question | /data/child_identification/a1_a_4a | 2021-04-23 18:59:31 | 2021-04-23 19:00:00 | NA | NA | NA | NA | S-F009-P0035 | 29 | a1_a_4a | If QR code scanning is not possible, please manually enter the participant identification code | S-F009-P0035 | NA | NA | NA |
| uuid:0e0117d5-cc64-4d13-a614-3a16d5ad672a | group questions | /data/front_page | 2021-04-23 18:58:54 | 2021-04-23 18:58:55 | NA | NA | NA | NA | NA | 1 | front_page | front_page | NA | NA | NA | NA |
| uuid:0e0117d5-cc64-4d13-a614-3a16d5ad672a | question | /data/steps[1]/step_type | 2021-04-23 18:59:03 | 2021-04-23 18:59:06 | NA | NA | NA | NA | 3 | 3 | step_type | step_type | triage | NA | NA | NA |
| uuid:0e0117d5-cc64-4d13-a614-3a16d5ad672a | question | /data/steps[2]/step_type | 2021-04-23 18:59:06 | 2021-04-23 18:59:09 | NA | NA | NA | NA | 5 | 3 | step_type | step_type | laboratory testing | NA | NA | NA |
| uuid:0e0117d5-cc64-4d13-a614-3a16d5ad672a | question | /data/steps[3]/step_type | 2021-04-23 18:59:09 | 2021-04-23 18:59:12 | NA | NA | NA | NA | 2 | 2 | step_type | step_type | waiting | NA | NA | NA |
# Column-wise summary of the final data frame (value ranges, types, NA counts)
summary(df)
## instance.ID event node start
## Length:64 Length:64 Length:64 Min. :2021-01-23 20:50:14
## Class :character Class :character Class :character 1st Qu.:2021-01-30 01:15:55
## Mode :character Mode :character Mode :character Median :2021-02-09 08:15:54
## Mean :2021-02-23 02:09:42
## 3rd Qu.:2021-04-23 18:58:54
## Max. :2021-04-23 19:00:03
##
## end latitude longitude accuracy old.value new.value
## Min. :2021-01-23 20:50:18 Mode:logical Mode:logical Mode:logical Min. :1 Length:64
## 1st Qu.:2021-01-30 01:16:16 NA's:64 NA's:64 NA's:64 1st Qu.:1 Class :character
## Median :2021-02-09 08:50:44 Median :1 Mode :character
## Mean :2021-02-23 18:49:15 Mean :1
## 3rd Qu.:2021-04-23 18:58:57 3rd Qu.:1
## Max. :2021-04-23 19:00:03 Max. :1
## NA's :16 NA's :63
## time_spent question question_decoded new_value_decoded old_value_decoded
## Min. : 1.00 Length:64 Length:64 Length:64 Length:64
## 1st Qu.: 3.75 Class :character Class :character Class :character Class :character
## Median : 8.00 Mode :character Mode :character Mode :character Mode :character
## Mean : 60.42
## 3rd Qu.: 18.00
## Max. :2073.00
## NA's :16
## time_till_change changed_from
## Mode:logical Mode:logical
## NA's:64 NA's:64
##
##
##
##
##
Grouped by Time
Events/Questions Started by Day
# Number of events/questions started per calendar day
df_by_day <- df %>%
  mutate(start_date = as.Date(start)) %>%
  count(start_date, name = "count")

# Daily counts as a line, overlaid with a LOESS smoother without a
# confidence band.
# Fixes: the x-axis label read "Satrt Date", and `se = F` used the
# reassignable shortcut `F` instead of `FALSE`.
gg1 <- ggplot(df_by_day, aes(x = start_date, y = count)) +
  geom_line() +
  geom_smooth(alpha = 0.5, colour = "red", method = "loess", se = FALSE) +
  labs(
    title = "Number of Events/Questions Started by Day with Smoothed Regression Line",
    y = "Number of Questions/Events Started",
    x = "Start Date"
  ) +
  theme_light()
gg1

Questions/Events Started by Weekday and Hour of the Day
# Count started events/questions per weekday (labelled, week starting on
# Monday) and hour of the day.
# `label = TRUE` replaces `label = T`: `T` is an ordinary variable that can be
# reassigned, `TRUE` cannot.
df_wday_hour <- df %>%
  mutate(
    wday = wday(start, label = TRUE, week_start = 1),
    hour = hour(start)
  ) %>%
  count(wday, hour, name = "count_wday_hour") %>%
  arrange(desc(wday))
# Minimal theme for the weekday/hour heatmap: no grid, border, ticks or
# legend; centred bold title; x-axis title suppressed.
theme_heatmap <- theme_light() +
  theme(
    # overall look
    plot.title = element_text(face = "bold", size = 11, hjust = 0.5),
    legend.position = "none",
    # strip the panel decoration
    panel.border = element_blank(),
    panel.grid = element_blank(),
    # axes
    axis.ticks = element_blank(),
    axis.title.x = element_blank(),
    axis.title.y = element_text(size = 10),
    axis.text.x = element_text(size = 10),
    axis.text.y = element_text(size = 8)
  )
# Heatmap of started events/questions: weekday on the x-axis (shown on top),
# hour of day on the reversed y-axis, cell colour and label = count.
gg2 <- ggplot(
  df_wday_hour,
  aes(x = wday, y = hour, fill = count_wday_hour)
) +
  geom_tile(colour = "white") +
  geom_text(aes(label = count_wday_hour), size = 2) +
  scale_x_discrete(expand = c(0, 0), position = "top") +
  scale_y_reverse(breaks = 23:0, labels = 23:0, expand = c(0, 0)) +
  scale_fill_gradient(low = "#fff0f0", high = "#940606") +
  labs(
    title = "Number of Started Events/Questions by Day of Week / Hour of Day",
    y = "Hour of Day"
  ) +
  theme_heatmap
gg2

Distribution of Time Spent per Event/Question with largest 5 % removed
# Keep only the events whose duration lies strictly below the 95th percentile
# of `time_spent`; `which()` also drops rows where `time_spent` is missing.
df_clean <- df[
  which(df$time_spent < quantile(df$time_spent, 0.95, na.rm = TRUE)),
]
# Histogram of the remaining durations, converted to minutes
hist(
  df_clean$time_spent[!is.na(df_clean$time_spent)] / 60,
  breaks = 20,
  main = "Histogram of the Time Spent by Question",
  xlab = "Time Spent in Minutes"
)

Aggregated by Event/Question
Count of Old-New Value Pairs
# Frequency of each (question, old answer, new answer) combination among the
# rows that have a `time_till_change`; only combinations occurring more than
# once are kept, most frequent first.
df_stream <- df %>%
  filter(!is.na(time_till_change)) %>%
  count(
    question_decoded, old_value_decoded, new_value_decoded,
    name = "count_value_pairs"
  ) %>%
  filter(count_value_pairs > 1) %>%
  arrange(desc(count_value_pairs))
df_stream
Aggregated by Instance
Top 10 % of Duration by Instance
# Instances whose total duration (latest `end` minus earliest `start`) lies
# above the 90th percentile, longest first.
# The duration is taken explicitly in seconds: a plain POSIXct subtraction
# returns a difftime with an automatically chosen unit (secs, mins, hours,
# ...), which `seconds_to_period()` would then misread as seconds.
# `na.rm = TRUE` replaces the reassignable shortcut `T`.
df_duration_per_inst <- df %>%
  group_by(instance.ID) %>%
  summarise(
    duration_per_inst = as.numeric(
      difftime(
        max(end, na.rm = TRUE),
        min(start, na.rm = TRUE),
        units = "secs"
      )
    )
  ) %>%
  filter(duration_per_inst > quantile(duration_per_inst, 0.9, na.rm = TRUE)) %>%
  # human-readable period (e.g. "43S") for display
  mutate(duration_per_inst = round(seconds_to_period(duration_per_inst))) %>%
  arrange(desc(duration_per_inst))
df_duration_per_inst
| instance.ID | duration_per_inst |
|---|---|
| uuid:1fe2c870-db0a-4f66-86f4-8418f3a0372f | 43S |
Distribution of Duration by Instance with Top 10 % excluded
# Duration per instance (latest `end` minus earliest `start`), keeping only
# instances strictly below the 90th percentile.
# The duration is taken explicitly in seconds so that the "/ 60" below really
# yields minutes; a plain POSIXct subtraction picks its unit automatically
# and may already be in minutes or hours.
# `na.rm = TRUE` replaces the reassignable shortcut `T`.
df_subsetted <- df %>%
  group_by(instance.ID) %>%
  summarise(
    duration_per_inst = as.numeric(
      difftime(
        max(end, na.rm = TRUE),
        min(start, na.rm = TRUE),
        units = "secs"
      )
    )
  ) %>%
  filter(duration_per_inst < quantile(duration_per_inst, 0.9, na.rm = TRUE))
hist(
  df_subsetted$duration_per_inst / 60,
  breaks = 30,
  main = "Duration per Instance in Minutes (outliers removed)",
  xlab = "Duration in Minutes"
)

Irregularities and Outliers
Time Till Change Outliers (computed on all data, without removing outliers first)
# Answer changes whose `time_till_change` exceeds the 90th percentile over
# all rows, longest first, shown as a readable period with the decoded
# question and answers.
df_time_till_change_outliers <- df %>%
  filter(
    time_till_change > quantile(time_till_change, 0.9, na.rm = TRUE)
  ) %>%
  select(
    instance.ID,
    question_decoded,
    old_value_decoded,
    new_value_decoded,
    time_till_change
  ) %>%
  arrange(desc(time_till_change)) %>%
  mutate(time_till_change = round(seconds_to_period(time_till_change)))
df_time_till_change_outliers
Histograms of Instances with Inconsistent Filling Behaviour
# Flag instances with an inconsistent filling behaviour: the start times of
# an instance are cut into 10 equal-width time bins, and the instance is
# flagged when its events fall into fewer than 5 distinct bins.
# Changes: the result is no longer grown with c() inside a loop,
# `labels = FALSE` replaces the reassignable shortcut `F`, and `<-` is used
# for assignment. With no flagged instance the result is now an empty vector
# of ids (length 0, as before).
irregular_inst <- Filter(
  function(id) {
    bin_vec <- cut(
      df$start[df$instance.ID == id],
      breaks = 10,
      labels = FALSE
    )
    length(unique(bin_vec)) < 5
  },
  unique(df$instance.ID)
)
paste0(
  length(irregular_inst),
  " out of ",
  length(unique(df$instance.ID)),
  " instances were found to have an inconsistent filling behaviour."
)
## [1] "1 out of 4 instances were found to have an inconsistent filling behaviour."
# For every irregular instance: cut its start times into 10 labelled parts,
# add one overlaid histogram of those parts to the plotly figure, and collect
# the decoded questions that fall into the last part.
fig <- plot_ly(alpha = 0.1)
last_bin_parts <- vector("list", length(irregular_inst))
for (k in seq_along(irregular_inst)) {
  inst_id <- irregular_inst[[k]]
  inst_df <- df[df$instance.ID == inst_id, ]
  inst_df$cut <- cut(
    inst_df$start,
    breaks = 10,
    labels = paste0(1:10, ". Part")
  )
  fig <- add_histogram(fig, x = inst_df$cut, name = inst_id)
  last_bin_parts[[k]] <- inst_df$question_decoded[inst_df$cut == "10. Part"]
}
last_bin_questions <- unlist(last_bin_parts)
fig <- layout(fig, barmode = "overlay")
fig
# Questions answered in the last part, most frequent first
last_bin_counts <- table(last_bin_questions) %>%
  as.data.frame() %>%
  arrange(desc(Freq))
kable(last_bin_counts)
| last_bin_questions | Freq |
|---|---|
| child_identification | 1 |
| If QR code scanning is not possible, please manually enter the participant identification code | 1 |
| step_end | 1 |
| step_other | 1 |
| summary | 1 |